/* Table 3 Panel A: bachelor-skewness regressions, school level */
libname edu '!userprofile\\Dropbox\Education\Replication\Data';

/* create a table with all years and all schools, fill in zeros when there's no record */
/* data from IPEDS */
proc sql;
  create table year as
  select distinct year_ending, major
  from edu.school;
quit;

proc sql;
  create table all as
  select a.unitid, b.year_ending, b.major
  from edu.schoolname as a, year as b;
quit;

proc sql;
  create table school as
  select a.*, b.number
  from all as a left join edu.school as b
  on a.year_ending = b.year_ending and a.major = b.major and a.unitid = b.unitid
  and b.total = 0; /* total = 0 is the aggregate */
quit;

proc sort data = school; by unitid year_ending major; run;
proc means data = school noprint;
  var number;
  by unitid year_ending major;
  output out = school_total sum = number;
run;

data school; set school_total; if year_ending < 2001 then delete; run;

/* delete school-major if they are all zeros */
proc sort data = school; by unitid major; run;
proc means data = school noprint;
  by unitid major;
  var number;
  output out = check sum = total;
run;

proc sql;
  create table school as
  select a.*, b.total
  from school as a, check as b
  where a.unitid = b.unitid and a.major = b.major;
quit;

data school; set school; 
if total = 0 then delete; 
run;

/* delete schools if they do not have enough majors */
data school_temp;
  set school;
  if number = 0 then delete;
run;

proc sort data = school_temp; by unitid year_ending; run;

proc means data = school_temp noprint;
  var major;
  by unitid year_ending;
  output out = major n = num_major;
run;

data major;
  set major;
  if num_major >= 5; /* require the schools to offer at least 5 majors */
run;

proc sql; create table school as select a.*, b.num_major from school as a left join major as b on a.unitid = b.unitid and a.year_ending = b.year_ending; quit;
data school (drop = num_major);
  set school;
  if num_major ~= .;
run;

/* merge with school level data */
proc sql;
  create table school as
  select a.*, b.state2 as state, b.schoolname
  from school as a, edu.schoolname as b
  where a.unitid = b.unitid;
quit;

data school;
  set school;
  log_bachelor = log(number+1);
run;

/* merge with ranking data from US News */
data schoolranking;
  set edu.schoolranking;
  if 1<= rank <= 50;
run;

proc sql;
  create table school as
  select a.*, b.rank as usnews
  from school as a left join schoolranking as b
  on a.schoolname = b.University_Name and a.year_ending - b.year = 4;
quit;

/* look at which state top employers are located for each major */
DATA crsp;  /*read all the crsp monthly data with nonmissing return*/
set edu.crsp_data;
IF not missing (RET);
if RET ne 'C';
if RET ne 'B';
month=month(date);
if 1<=EXCHCD<=3 and SHRCD in (10 11);
naics_3=substr(naics,1,3);
naics_33 = naics_3+1-1;
sic_2=floor(siccd/100);
if month = 12;
run;

* get # employees, if there are duplicate entries, choose the larger and the most recent;
data emp (keep = gvkey lpermno datadate year emp); set edu.crspcompustat_data; year = year(datadate); run; 

proc sort data = emp; by lpermno year descending datadate descending emp; run;
proc sort data = emp nodupkey; by lpermno gvkey year; run;

* get state information for each gvkey;
proc sql;
  create table emp2 as
  select distinct a.*, b.state
  from emp as a, edu.crspcompustat_location as b
  where a.gvkey = b.gvkey and a.year = year(b.datadate);
quit;

* get industry and major for each gvkey;
proc sql;
  create table emp3 as
  select a.*, b.sic_2
  from emp2 as a, crsp as b
  where a.lpermno = b.permno and a.year = year(b.date) and a.year < 2005;
quit;

proc sql;
  create table emp4 as
  select a.*, b.naics_33
  from emp2 as a, crsp as b
  where a.lpermno = b.permno and a.year = year(b.date) and a.year >= 2005;
quit;

proc sql;
  create table emp5 as
  select distinct a.*, b.major
  from emp3 as a, edu.sic_2_major as b
  where a.sic_2 = b.sic_2 and major ~= .;
quit;

proc sql;
  create table emp6 as
  select distinct a.*, b.major
  from emp4 as a, edu.naics_3_major as b
  where a.naics_33 = b.naics_3 and major ~= .;
quit;

proc append base = emp5 data = emp6 force; run;

* identify the top state in total employment for each major;
proc sort data = emp5; by major year state; run;
proc means data = emp5 noprint;
  var emp;
  by major year state;
  output out = emp6 sum = emp;
run;

proc sort data = emp6; by major year descending emp; run;

data topemp;
  set emp6;
  by major year;
  if first.year;
run;

* merge with school data;
proc sql;
  create table school as
  select distinct a.*, b.state as top
  from school as a left join topemp as b
  on a.state = b.state and a.major = b.major and a.year_ending - 4 = b.year;
quit;

data school;
  set school;
  rank = 0;
  if top ~= "" and 1<= usnews <= 50 then rank = 1;
  log_bachelor = log(number+1);
run;

/* run school level regressions */
/* merge with regression_avg2e_major to get RHS variables */
proc sql;
  create table regression_avg4 as
  select a.*, b.skew1e_major_avg2, b.log_mktcape_major, b.log_btme_major, b.log_agee_major, b.tsmean_major_avg2, b.tsvol_major_avg2
  from school as a, edu.regression_avg2e_major as b
  where a.year_ending = b.year_ending and a.major = b.majorcode;
quit;

/* merge with wage data (average industry entry-level wage and average major entry-level wage) */
data wage (keep = year major annual_wage);
  set edu.wagesoc;
  if year < 1999 then delete;
run;

data wage2 (keep = year major annual_wage);
  set edu.compustat_wage;
  if year >= 1999 then delete;
  annual_wage = annual_wagee_major;
run;

proc append base = wage2 data = wage; run;

proc sql;
  create table regression_avg4 as
  select a.*, b.annual_wage as annual_wage_avg2, b.year
  from regression_avg4 as a left join wage2 as b
  on a.major = b.major and a.year_ending - b.year = 3;
quit;

* standardize;
proc standard data = regression_avg4 out = regression_avg3 mean = 0 std = 1; run;

data regression_avg4; set regression_avg4; num = _N_; run;
data regression_avg3 (drop = log_bachelor unitid year_ending major rank); set regression_avg3; num = _N_; run;

proc sql;
  create table regression_avg4 as
  select a.*, b.log_bachelor, b.unitid, b.year_ending, b.major, b.rank
  from regression_avg3 as a, regression_avg4 as b
  where a.num = b.num;
quit;

data regression_avg4 (drop = number total sector num);
  set regression_avg4;
  year_ending2 = year_ending;
  if rank = 0 then year_ending2 = year_ending2+10000;
  skew_top = skew1e_major_avg2*rank; 
run;

proc surveyreg data = regression_avg4;
  class 
  year_ending2
  major
;
  cluster year_ending;
  model log_bachelor= 
  skew1e_major_avg2

  tsmean_major_avg2
  tsvol_major_avg2
  annual_wage_avg2
  log_mktcape_major
  log_btme_major
  log_agee_major

  year_ending2
  major
  /solution adjrsq;
run;

proc surveyreg data = regression_avg4;
  class 
  year_ending2
  major
;
  cluster year_ending;
  model log_bachelor= 
  skew1e_major_avg2
  skew_top

  tsmean_major_avg2
  tsvol_major_avg2
  annual_wage_avg2
  log_mktcape_major
  log_btme_major
  log_agee_major

  year_ending2
  major
  /solution adjrsq;
run;

